From d32e115cb8c43fe4531567b1c668dba6dc76274d Mon Sep 17 00:00:00 2001
From: mru <mru@9553f0bf-9b14-0410-a0b8-cfaf0461ba5b>
Date: Fri, 14 Aug 2009 01:02:06 +0000
Subject: [PATCH 14/27] ARM: NEON optimised vorbis_inverse_coupling

12% faster Vorbis decoding on Cortex-A8.

git-svn-id: file:///var/local/repositories/ffmpeg/trunk@19637 9553f0bf-9b14-0410-a0b8-cfaf0461ba5b
---
 libavcodec/arm/dsputil_neon.c   |    5 +++
 libavcodec/arm/dsputil_neon_s.S |   64 +++++++++++++++++++++++++++++++++++++++
 2 files changed, 69 insertions(+), 0 deletions(-)

diff --git a/libavcodec/arm/dsputil_neon.c b/libavcodec/arm/dsputil_neon.c
index 20425c1..eb9aba1 100644
--- a/libavcodec/arm/dsputil_neon.c
+++ b/libavcodec/arm/dsputil_neon.c
@@ -161,6 +161,8 @@ void ff_vector_fmul_window_neon(float *dst, const float *src0,
 void ff_float_to_int16_neon(int16_t *, const float *, long);
 void ff_float_to_int16_interleave_neon(int16_t *, const float **, long, int);
 
+void ff_vorbis_inverse_coupling_neon(float *mag, float *ang, int blocksize);
+
 void ff_dsputil_init_neon(DSPContext *c, AVCodecContext *avctx)
 {
     c->put_pixels_tab[0][0] = ff_put_pixels16_neon;
@@ -270,4 +272,7 @@ void ff_dsputil_init_neon(DSPContext *c, AVCodecContext *avctx)
         c->float_to_int16 = ff_float_to_int16_neon;
         c->float_to_int16_interleave = ff_float_to_int16_interleave_neon;
     }
+
+    if (CONFIG_VORBIS_DECODER)
+        c->vorbis_inverse_coupling = ff_vorbis_inverse_coupling_neon;
 }
diff --git a/libavcodec/arm/dsputil_neon_s.S b/libavcodec/arm/dsputil_neon_s.S
index 303b11c..2bc07fa 100644
--- a/libavcodec/arm/dsputil_neon_s.S
+++ b/libavcodec/arm/dsputil_neon_s.S
@@ -19,6 +19,7 @@
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
+#include "config.h"
 #include "asm.S"
 
         preserve8
@@ -793,3 +794,66 @@ function ff_vector_fmul_window_neon, export=1
         vst1.64         {d22,d23},[ip,:128], r5
         pop             {r4,r5,pc}
         .endfunc
+
+#if CONFIG_VORBIS_DECODER
+function ff_vorbis_inverse_coupling_neon, export=1
+        vmov.i32        q10, #1<<31
+        subs            r2,  r2,  #4
+        tst             r2,  #4
+        mov             r3,  r0
+        mov             r12, r1
+        beq             3f
+
+        vld1.32         {d24-d25},[r1,:128]!
+        vld1.32         {d22-d23},[r0,:128]!
+        vcle.s32        q8,  q12, #0
+        vand            q9,  q11, q10
+        veor            q12, q12, q9
+        vand            q2,  q12, q8
+        vbic            q3,  q12, q8
+        vadd.f32        q12, q11, q2
+        vsub.f32        q11, q11, q3
+1:      vld1.32         {d2-d3},  [r1,:128]!
+        vld1.32         {d0-d1},  [r0,:128]!
+        vcle.s32        q8,  q1,  #0
+        vand            q9,  q0,  q10
+        veor            q1,  q1,  q9
+        vst1.32         {d24-d25},[r3, :128]!
+        vst1.32         {d22-d23},[r12,:128]!
+        vand            q2,  q1,  q8
+        vbic            q3,  q1,  q8
+        vadd.f32        q1,  q0,  q2
+        vsub.f32        q0,  q0,  q3
+        subs            r2,  r2,  #8
+        ble             2f
+        vld1.32         {d24-d25},[r1,:128]!
+        vld1.32         {d22-d23},[r0,:128]!
+        vcle.s32        q8,  q12, #0
+        vand            q9,  q11, q10
+        veor            q12, q12, q9
+        vst1.32         {d2-d3},  [r3, :128]!
+        vst1.32         {d0-d1},  [r12,:128]!
+        vand            q2,  q12, q8
+        vbic            q3,  q12, q8
+        vadd.f32        q12, q11, q2
+        vsub.f32        q11, q11, q3
+        b               1b
+
+2:      vst1.32         {d2-d3},  [r3, :128]!
+        vst1.32         {d0-d1},  [r12,:128]!
+        bxlt            lr
+
+3:      vld1.32         {d2-d3},  [r1,:128]
+        vld1.32         {d0-d1},  [r0,:128]
+        vcle.s32        q8,  q1,  #0
+        vand            q9,  q0,  q10
+        veor            q1,  q1,  q9
+        vand            q2,  q1,  q8
+        vbic            q3,  q1,  q8
+        vadd.f32        q1,  q0,  q2
+        vsub.f32        q0,  q0,  q3
+        vst1.32         {d2-d3},  [r0,:128]!
+        vst1.32         {d0-d1},  [r1,:128]!
+        bx              lr
+        .endfunc
+#endif
-- 
1.6.3.3

